# Install the required packages only if missing and suppress redundant output
packages <- c("tidyverse", "caret", "knitr", "kableExtra", "skimr", "ggplot2","dplyr","ISLR","tree","rpart","rpart.plot","randomForest","gbm","doParallel","naniar","patchwork","MASS","scales","plotly","reshape2","e1071","ggcorrplot","gt","nortest","gridExtra")
new_packages <- setdiff(packages, rownames(installed.packages()))
if (length(new_packages) > 0) install.packages(new_packages, quietly = TRUE)
# Load the required packages quietly
invisible(lapply(packages, function(pkg) suppressPackageStartupMessages(library(pkg, character.only = TRUE))))
#Import the dataset. Change the file path to where you saved your bank.txt file
bank <- read_csv("F:/ECON/562_Analytics_2/Final Project/bank.txt",
col_types = cols(b_tgt = col_character(),
int_tgt = col_number(), cnt_tgt = col_double(),
demog_homeval = col_number(), demog_inc = col_number(),
rfm1 = col_number(), rfm2 = col_number(),
rfm3 = col_number(), rfm4 = col_number(),
demog_genf = col_character(), demog_genm = col_character(),
dataset = col_character()))
# Display the first 10 rows of the dataset
bank %>%
slice(1:10) %>%
kable("html", caption = "First 10 Rows of the Bank Dataset") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
full_width = FALSE,
position = "center")
| b_tgt | int_tgt | cnt_tgt | cat_input1 | cat_input2 | demog_age | demog_ho | demog_homeval | demog_inc | demog_pr | rfm1 | rfm2 | rfm3 | rfm4 | rfm5 | rfm6 | rfm7 | rfm8 | rfm9 | rfm10 | rfm11 | rfm12 | demog_genf | demog_genm | account | dataset |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 7000 | NA | X | A | NA | 0 | 57600 | 52106 | 24 | 5.71 | 5.36 | 5.25 | 10 | 7 | 22 | 4 | 6 | 5 | 20 | 9 | 92 | 0 | 1 | 1e+08 | 1 |
| 1 | 7000 | 2 | X | A | NA | 0 | 57587 | 52106 | 24 | 5.71 | 5.36 | 5.25 | 10 | 7 | 22 | 4 | 6 | 5 | 20 | 9 | 92 | 0 | 1 | 1e+08 | 3 |
| 1 | 15000 | 2 | X | A | NA | 0 | 44167 | 42422 | 0 | 12.80 | 10.75 | 13.00 | 12 | 5 | 16 | 3 | 8 | 16 | 27 | 11 | 91 | 0 | 1 | 1e+08 | 1 |
| 0 | NA | 0 | X | A | 68 | 0 | 90587 | 59785 | 32 | 21.60 | 20.10 | 20.00 | 25 | 5 | 21 | 2 | 7 | 15 | 19 | 9 | 123 | 1 | 0 | 1e+08 | 2 |
| 0 | NA | 0 | X | A | NA | 0 | 100313 | 0 | 0 | 7.33 | 4.68 | 7.60 | 10 | 6 | 38 | 5 | 19 | 24 | 13 | 6 | 128 | 1 | 0 | 1e+08 | 3 |
| 0 | NA | 0 | X | A | 26 | 0 | 26622 | 34444 | 0 | 25.00 | 7.62 | NA | 25 | 1 | 13 | 0 | 4 | 26 | 13 | 5 | 91 | 0 | 1 | 1e+08 | 1 |
| 0 | NA | 0 | X | A | 74 | 1 | 95496 | 0 | 0 | 16.00 | 10.22 | NA | 16 | 1 | 9 | 0 | 2 | 19 | 20 | 7 | 90 | 1 | 0 | 1e+08 | 1 |
| 0 | NA | 0 | X | A | 83 | 0 | 65814 | 32597 | 34 | 14.33 | 11.41 | 14.00 | 15 | 3 | 22 | 2 | 10 | 20 | 13 | 8 | 109 | 1 | 0 | 1e+08 | 2 |
| 0 | NA | 0 | X | A | 70 | 0 | 26007 | 33748 | 38 | 16.00 | 20.10 | 17.00 | 17 | 4 | 30 | 1 | 7 | 17 | 26 | 11 | 104 | 0 | 1 | 1e+08 | 1 |
| 0 | NA | 0 | X | A | 77 | 1 | 0 | 0 | 0 | 20.00 | 11.36 | NA | 15 | 2 | 11 | 0 | 1 | 14 | 29 | 10 | 90 | 0 | 1 | 1e+08 | 3 |
str(bank)
## spc_tbl_ [1,060,038 × 26] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ b_tgt : chr [1:1060038] "1" "1" "1" "0" ...
## $ int_tgt : num [1:1060038] 7000 7000 15000 NA NA NA NA NA NA NA ...
## $ cnt_tgt : num [1:1060038] NA 2 2 0 0 0 0 0 0 0 ...
## $ cat_input1 : chr [1:1060038] "X" "X" "X" "X" ...
## $ cat_input2 : chr [1:1060038] "A" "A" "A" "A" ...
## $ demog_age : num [1:1060038] NA NA NA 68 NA 26 74 83 70 77 ...
## $ demog_ho : num [1:1060038] 0 0 0 0 0 0 1 0 0 1 ...
## $ demog_homeval: num [1:1060038] 57600 57587 44167 90587 100313 ...
## $ demog_inc : num [1:1060038] 52106 52106 42422 59785 0 ...
## $ demog_pr : num [1:1060038] 24 24 0 32 0 0 0 34 38 0 ...
## $ rfm1 : num [1:1060038] 5.71 5.71 12.8 21.6 7.33 ...
## $ rfm2 : num [1:1060038] 5.36 5.36 10.75 20.1 4.68 ...
## $ rfm3 : num [1:1060038] 5.25 5.25 13 20 7.6 NA NA 14 17 NA ...
## $ rfm4 : num [1:1060038] 10 10 12 25 10 25 16 15 17 15 ...
## $ rfm5 : num [1:1060038] 7 7 5 5 6 1 1 3 4 2 ...
## $ rfm6 : num [1:1060038] 22 22 16 21 38 13 9 22 30 11 ...
## $ rfm7 : num [1:1060038] 4 4 3 2 5 0 0 2 1 0 ...
## $ rfm8 : num [1:1060038] 6 6 8 7 19 4 2 10 7 1 ...
## $ rfm9 : num [1:1060038] 5 5 16 15 24 26 19 20 17 14 ...
## $ rfm10 : num [1:1060038] 20 20 27 19 13 13 20 13 26 29 ...
## $ rfm11 : num [1:1060038] 9 9 11 9 6 5 7 8 11 10 ...
## $ rfm12 : num [1:1060038] 92 92 91 123 128 91 90 109 104 90 ...
## $ demog_genf : chr [1:1060038] "0" "0" "0" "1" ...
## $ demog_genm : chr [1:1060038] "1" "1" "1" "0" ...
## $ account : num [1:1060038] 1e+08 1e+08 1e+08 1e+08 1e+08 ...
## $ dataset : chr [1:1060038] "1" "3" "1" "2" ...
## - attr(*, "spec")=
## .. cols(
## .. b_tgt = col_character(),
## .. int_tgt = col_number(),
## .. cnt_tgt = col_double(),
## .. cat_input1 = col_character(),
## .. cat_input2 = col_character(),
## .. demog_age = col_double(),
## .. demog_ho = col_double(),
## .. demog_homeval = col_number(),
## .. demog_inc = col_number(),
## .. demog_pr = col_double(),
## .. rfm1 = col_number(),
## .. rfm2 = col_number(),
## .. rfm3 = col_number(),
## .. rfm4 = col_number(),
## .. rfm5 = col_double(),
## .. rfm6 = col_double(),
## .. rfm7 = col_double(),
## .. rfm8 = col_double(),
## .. rfm9 = col_double(),
## .. rfm10 = col_double(),
## .. rfm11 = col_double(),
## .. rfm12 = col_double(),
## .. demog_genf = col_character(),
## .. demog_genm = col_character(),
## .. account = col_double(),
## .. dataset = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
summary(bank)
## b_tgt int_tgt cnt_tgt cat_input1
## Length:1060038 Min. : 0 Min. :0.0000 Length:1060038
## Class :character 1st Qu.: 6000 1st Qu.:0.0000 Class :character
## Mode :character Median : 10000 Median :0.0000 Mode :character
## Mean : 11236 Mean :0.3118
## 3rd Qu.: 16000 3rd Qu.:0.0000
## Max. :500000 Max. :6.0000
## NA's :848529 NA's :1
## cat_input2 demog_age demog_ho demog_homeval
## Length:1060038 Min. :-1.00 Min. :0.0000 Min. : 0
## Class :character 1st Qu.:46.00 1st Qu.:0.0000 1st Qu.: 51107
## Mode :character Median :60.00 Median :1.0000 Median : 73880
## Mean :58.72 Mean :0.5503 Mean :106104
## 3rd Qu.:73.00 3rd Qu.:1.0000 3rd Qu.:122214
## Max. :89.00 Max. :1.0000 Max. :600067
## NA's :266861
## demog_inc demog_pr rfm1 rfm2
## Min. : 0 Min. : 0.00 Min. : 0.00 Min. : 1.58
## 1st Qu.: 26084 1st Qu.: 25.00 1st Qu.: 10.00 1st Qu.: 8.40
## Median : 43174 Median : 31.00 Median : 15.00 Median : 11.67
## Mean : 40369 Mean : 30.57 Mean : 16.09 Mean : 13.35
## 3rd Qu.: 56896 3rd Qu.: 37.00 3rd Qu.: 20.00 3rd Qu.: 15.50
## Max. :200007 Max. :101.00 Max. :3713.31 Max. :650.00
##
## rfm3 rfm4 rfm5 rfm6
## Min. : 0.00 Min. : 0.00 Min. : 0.000 Min. : 0.000
## 1st Qu.: 10.00 1st Qu.: 10.00 1st Qu.: 1.000 1st Qu.: 3.000
## Median : 14.00 Median : 15.00 Median : 2.000 Median : 7.000
## Mean : 15.31 Mean : 17.47 Mean : 2.908 Mean : 9.539
## 3rd Qu.: 20.00 3rd Qu.: 20.00 3rd Qu.: 4.000 3rd Qu.: 13.000
## Max. :3713.31 Max. :10000.00 Max. :18.000 Max. :127.000
## NA's :225786
## rfm7 rfm8 rfm9 rfm10
## Min. : 0.000 Min. : 0.000 Min. : 2.00 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 2.000 1st Qu.:16.00 1st Qu.:11.00
## Median : 1.000 Median : 4.000 Median :18.00 Median :12.00
## Mean : 1.667 Mean : 5.026 Mean :18.35 Mean :12.89
## 3rd Qu.: 2.000 3rd Qu.: 7.000 3rd Qu.:20.00 3rd Qu.:13.00
## Max. :11.000 Max. :46.000 Max. :29.00 Max. :77.00
##
## rfm11 rfm12 demog_genf demog_genm
## Min. : 0.000 Min. : 0.00 Length:1060038 Length:1060038
## 1st Qu.: 5.000 1st Qu.: 33.00 Class :character Class :character
## Median : 6.000 Median : 64.00 Mode :character Mode :character
## Mean : 5.359 Mean : 68.13
## 3rd Qu.: 6.000 3rd Qu.:103.00
## Max. :22.000 Max. :571.00
##
## account dataset
## Min. :100000001 Length:1060038
## 1st Qu.:100265010 Class :character
## Median :100530020 Mode :character
## Mean :100530020
## 3rd Qu.:100795029
## Max. :101060038
##
| Data Type | Description | Columns |
|---|---|---|
| Numeric | Continuous variables or discrete numeric inputs. | 18 Columns: int_tgt, cnt_tgt,
rfm1-12, demog_age,
demog_homeval, demog_inc,
demog_pr |
| Factor | Categorical with fixed levels | 6 Columns: b_tgt, cat_input1,
cat_input2, demog_ho, demog_genf,
demog_genm |
| Identifier | Unique identifiers for each row | 1 Column: account |
| Partition | Subset of the dataset | 1 Column: dataset |
Variable Descriptions
| Variable | Type | Description |
|---|---|---|
| Target Variables | ||
B_TGT |
Binary | Tried a New Product (Yes/No — may be recoded as 1/0) |
INT_TGT |
Numeric | Total New Sales |
CNT_TGT |
Count | Count of New Products Purchased |
| Categorical Predictors | ||
CAT_INPUT1 |
Categorical | Account Activity |
CAT_INPUT2 |
Categorical | Customer Value Level |
| RFM Interval Inputs | ||
RFM1 |
Numeric | Average Sales Past 3 Years |
RFM2 |
Numeric | Average Sales Lifetime |
RFM3 |
Numeric | Avg Sales Past 3 Years - Direct Promo Response |
RFM4 |
Numeric | Last Product Purchase Amount |
RFM5 |
Numeric | Count Purchased Past 3 Years |
RFM6 |
Numeric | Count Purchased Lifetime |
RFM7 |
Numeric | Count Purchased Past 3 Years - Direct Promo Response |
RFM8 |
Numeric | Count Purchased Lifetime - Direct Promo Response |
RFM9 |
Numeric | Months Since Last Purchase |
RFM10 |
Numeric | Count Total Promos Past Year |
RFM11 |
Numeric | Count Direct Promos Past Year |
RFM12 |
Numeric | Customer Tenure |
| Demographic Inputs | ||
DEMOG_AGE |
Numeric | Customer Age |
DEMOG_GENF |
Binary | Female (Yes/No) |
DEMOG_GENM |
Binary | Male (Yes/No) |
DEMOG_HO |
Binary | Homeowner (Yes/No) |
DEMOG_HOMEVAL |
Numeric | Home Value |
DEMOG_INC |
Numeric | Income |
DEMOG_PR |
Numeric | Geographical Retirement Percentage |
General Structure
The dataset contains 1,060,038 rows and 26 columns.
The dataset is divided into three subsets: training (1), validation (2), and testing (3).
3 target variables
Potential Issues
Many missing values
Redundant Gender columns: do not need both male and female. Will only keep the female column.
Zero values in demog_homeval,
demog_inc, and demog_pr columns. These values
are suspicious and may be treated as missing values.
# Missing Value Table
bank %>%
miss_var_summary() %>%
mutate(pct_miss = round(pct_miss, 2)) %>%
kable("html", caption = "Missing Data Summary for 'bank' Dataset") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
full_width = F,
position = "center")
| variable | n_miss | pct_miss |
|---|---|---|
| int_tgt | 848529 | 80.0 |
| demog_age | 266861 | 25.2 |
| rfm3 | 225786 | 21.3 |
| cnt_tgt | 1 | 0 |
| b_tgt | 0 | 0 |
| cat_input1 | 0 | 0 |
| cat_input2 | 0 | 0 |
| demog_ho | 0 | 0 |
| demog_homeval | 0 | 0 |
| demog_inc | 0 | 0 |
| demog_pr | 0 | 0 |
| rfm1 | 0 | 0 |
| rfm2 | 0 | 0 |
| rfm4 | 0 | 0 |
| rfm5 | 0 | 0 |
| rfm6 | 0 | 0 |
| rfm7 | 0 | 0 |
| rfm8 | 0 | 0 |
| rfm9 | 0 | 0 |
| rfm10 | 0 | 0 |
| rfm11 | 0 | 0 |
| rfm12 | 0 | 0 |
| demog_genf | 0 | 0 |
| demog_genm | 0 | 0 |
| account | 0 | 0 |
| dataset | 0 | 0 |
Important Note:
The dataset contains a significant amount of missing values,
particularly in the int_tgt variable, which has
80% missing values. This is not a major concern due to
the nature of the modelling focus. The int_tgt variable is
set to N/A for any b_tgt that is “no” and since we will be
modelling for how much a customer will spend given they have purchased a
new product, we can effectively ignore all missing values for this
variable.
The other two highly missing variables are demog_age
and rfm3, which have 25.2% and
21.3% missing values, respectively. These variables
will be imputed using the median or mean values depending on the
normality due to the importance of these variables.
# Numerical variable names
numerical_vars <- c("int_tgt", "cnt_tgt", "rfm1", "rfm2", "rfm3", "rfm4",
"rfm5", "rfm6", "rfm7", "rfm8", "rfm9", "rfm10",
"rfm11", "rfm12", "demog_age", "demog_homeval",
"demog_inc", "demog_pr")
# Function to compute summary statistics
compute_summary_statistics <- function(bank, numerical_vars) {
summary_stats <- bank %>%
dplyr::select(dplyr::all_of(numerical_vars)) %>%
dplyr::summarise(dplyr::across(
everything(),
list(
mean = ~mean(., na.rm = TRUE),
sd = ~sd(., na.rm = TRUE),
median = ~median(., na.rm = TRUE),
min = ~min(., na.rm = TRUE),
max = ~max(., na.rm = TRUE),
skewness = ~e1071::skewness(., na.rm = TRUE)
),
.names = "{col}_{fn}"
)) %>%
tidyr::pivot_longer(cols = everything(),
names_to = c("Variable", "Statistic"),
names_pattern = "(.*)_(.*)",
values_to = "Value") %>%
tidyr::pivot_wider(names_from = Statistic, values_from = Value) %>%
dplyr::mutate(dplyr::across(where(is.numeric), ~round(.x, 2)))
return(summary_stats)
}
# Run the function
summary_statistics <- compute_summary_statistics(bank, numerical_vars)
# Display nicely formatted HTML table
summary_statistics %>%
kable("html", caption = "Summary Statistics for Selected Numerical Variables") %>%
kable_styling(full_width = TRUE, bootstrap_options = c("striped", "hover")) %>%
scroll_box(width = "100%", height = "500px")
| Variable | mean | sd | median | min | max | skewness |
|---|---|---|---|---|---|---|
| int_tgt | 11235.87 | 8491.80 | 10000.00 | 0.00 | 500000.00 | 13.30 |
| cnt_tgt | 0.31 | 0.70 | 0.00 | 0.00 | 6.00 | 2.40 |
| rfm1 | 16.09 | 19.30 | 15.00 | 0.00 | 3713.31 | 103.31 |
| rfm2 | 13.35 | 9.47 | 11.67 | 1.58 | 650.00 | 11.12 |
| rfm3 | 15.31 | 18.97 | 14.00 | 0.00 | 3713.31 | 114.92 |
| rfm4 | 17.47 | 37.55 | 15.00 | 0.00 | 10000.00 | 207.18 |
| rfm5 | 2.91 | 2.03 | 2.00 | 0.00 | 18.00 | 1.23 |
| rfm6 | 9.54 | 8.47 | 7.00 | 0.00 | 127.00 | 1.91 |
| rfm7 | 1.67 | 1.53 | 1.00 | 0.00 | 11.00 | 1.22 |
| rfm8 | 5.03 | 4.51 | 4.00 | 0.00 | 46.00 | 1.42 |
| rfm9 | 18.35 | 4.02 | 18.00 | 2.00 | 29.00 | -0.60 |
| rfm10 | 12.89 | 4.61 | 12.00 | 0.00 | 77.00 | 2.86 |
| rfm11 | 5.36 | 1.36 | 6.00 | 0.00 | 22.00 | 0.32 |
| rfm12 | 68.13 | 37.35 | 64.00 | 0.00 | 571.00 | 0.30 |
| demog_age | 58.72 | 16.85 | 60.00 | -1.00 | 89.00 | -0.36 |
| demog_homeval | 106103.55 | 93289.97 | 73880.00 | 0.00 | 600067.00 | 2.46 |
| demog_inc | 40368.69 | 28029.02 | 43174.00 | 0.00 | 200007.00 | 0.23 |
| demog_pr | 30.57 | 11.53 | 31.00 | 0.00 | 101.00 | -0.15 |
# List of categorical variable names
categorical_vars <- c("b_tgt", "cat_input1", "cat_input2", "demog_ho", "demog_genf")
# Function to compute frequency tables
compute_frequency_tables <- function(bank, categorical_vars) {
freq_tables <- lapply(categorical_vars, function(var) {
bank %>%
dplyr::select(dplyr::all_of(var)) %>%
dplyr::filter(!is.na(.data[[var]])) %>%
dplyr::group_by(.data[[var]]) %>%
dplyr::summarise(Count = n(), .groups = "drop") %>%
dplyr::mutate(
Level = as.character(.data[[var]]), # Force Level to be character
Percent = round(100 * Count / sum(Count), 2),
Variable = var
) %>%
dplyr::select(Variable, Level, Count, Percent)
})
# Bind all tables into one
freq_tables_df <- dplyr::bind_rows(freq_tables)
return(freq_tables_df)
}
# Run the function
frequency_tables <- compute_frequency_tables(bank, categorical_vars)
# Display nicely formatted HTML table
frequency_tables %>%
kable("html", caption = "Frequency Tables for Selected Categorical Variables") %>%
kable_styling(full_width = TRUE, bootstrap_options = c("striped", "hover")) %>%
scroll_box(width = "100%", height = "500px")
| Variable | Level | Count | Percent |
|---|---|---|---|
| b_tgt | 0 | 848529 | 80.05 |
| b_tgt | 1 | 211509 | 19.95 |
| cat_input1 | X | 831371 | 78.43 |
| cat_input1 | Y | 77847 | 7.34 |
| cat_input1 | Z | 150820 | 14.23 |
| cat_input2 | A | 188398 | 17.77 |
| cat_input2 | B | 192382 | 18.15 |
| cat_input2 | C | 169550 | 15.99 |
| cat_input2 | D | 122282 | 11.54 |
| cat_input2 | E | 387426 | 36.55 |
| demog_ho | 0 | 476741 | 44.97 |
| demog_ho | 1 | 583297 | 55.03 |
| demog_genf | 0 | 464288 | 43.80 |
| demog_genf | 1 | 595750 | 56.20 |
# Define all numerical variables
numerical_vars <- c("int_tgt", "cnt_tgt", paste0("rfm", 1:12),
"demog_age", "demog_homeval", "demog_inc", "demog_pr")
# Generate a vector of distinct colors
colors <- hue_pal()(length(numerical_vars))
# Create a list to store plots
num_plots <- list()
for (i in seq_along(numerical_vars)) {
var <- numerical_vars[i]
color <- colors[i]
plot_data <- bank %>% filter(!is.na(.data[[var]]))
p <- ggplot(plot_data, aes(x = .data[[var]])) +
geom_histogram(bins = 30, fill = color, color = "black") +
labs(title = paste("Distribution of", var), x = var, y = "Frequency") +
theme_minimal()
num_plots[[i]] <- p
}
# Display in 2x2 grids
num_chunks <- ceiling(length(num_plots) / 4)
for (i in 1:num_chunks) {
idx <- ((i - 1) * 4 + 1):min(i * 4, length(num_plots))
grid.arrange(grobs = num_plots[idx], ncol = 2)
}
# Now do the same for categorical variables
categorical_vars <- c("b_tgt", "cat_input1", "cat_input2", "demog_ho", "demog_genf", "demog_genm")
cat_plots <- list()
for (var in categorical_vars) {
p <- ggplot(bank, aes(x = .data[[var]])) +
geom_bar(fill = "#868686FF") +
labs(title = paste("Distribution of", var), x = var, y = "Count") +
theme_minimal()
cat_plots[[length(cat_plots) + 1]] <- p
}
# Display in 2x2 grids
cat_chunks <- ceiling(length(cat_plots) / 4)
for (i in 1:cat_chunks) {
idx <- ((i - 1) * 4 + 1):min(i * 4, length(cat_plots))
grid.arrange(grobs = cat_plots[idx], ncol = 2)
}
The notable observations from these distributations for consideration:
b_tgt: - This variable exhibits a
significant imbalance, with a large number of “no” responses compared to
“yes”. This is common in marketing datasets where the target event
(e.g., trying a new product) is rare.
int_tgt: - This variable shows high
skewness, indicating a right-skewed distribution. This suggests that
most customers have low total new sales, with a few customers having
very high sales. This chart may also indicate the presence of outliers
by the long right tail. Previously, the summary statistic showed a max
value of 500000, which is likely a significant outlier. This will be
properly addressed in the data cleaning section to return to a normal
distribution as many regression models assume normality.
cnt_tgt: - This variable also shows a
right-skewed distribution, similar to int_tgt. This is due
to most customers having 0 new products purchased, with a few customers
having purchased multiple products. However, the summary statistics show
that the skew of this variable is much lower than int_tgt,
indicating that the distribution is more normal, yet still may benefit
from transformations.
# Compute the correlation matrix for numeric columns in the bank dataset
correlation_matrix <- cor(bank[, sapply(bank, is.numeric)], use = "complete.obs")
# Melt the correlation matrix to long format
melted_correlation <- melt(correlation_matrix)
# Truncate the correlation values to the nearest hundredth
melted_correlation$value <- trunc(melted_correlation$value * 100) / 100
# Create a heatmap using ggplot2
heatmap_plot <- ggplot(data = melted_correlation, aes(Var1, Var2, fill = value)) +
geom_tile(color = "white") +
scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0) +
labs(title = "Correlation Heatmap of Numerical Variables",
x = "Variable",
y = "Variable") +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
axis.title.x = element_text(size = 12, face = "bold"),
axis.title.y = element_text(size = 12, face = "bold"),
axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1, size = 10),
axis.text.y = element_text(size = 10)
)
# Convert to interactive plot
interactive_heatmap <- ggplotly(heatmap_plot, tooltip = c("Var1", "Var2", "value"))
# Show the heatmap
interactive_heatmap
# Compute correlation matrix
cor_matrix <- cor(bank %>% select(where(is.numeric)), use = "complete.obs")
# Melt to long format
cor_long <- melt(cor_matrix)
# Remove self-correlations and duplicate pairs
cor_filtered <- cor_long %>%
filter(Var1 != Var2) %>%
rowwise() %>%
mutate(pair = paste(sort(c(Var1, Var2)), collapse = "_")) %>%
distinct(pair, .keep_all = TRUE) %>%
filter(abs(value) > 0.6) %>%
arrange(desc(abs(value)))
# Create a nice table
cor_table <- cor_filtered %>%
select(Variable1 = Var1, Variable2 = Var2, Correlation = value) %>%
mutate(Correlation = round(Correlation, 2)) %>%
kable("html", caption = "Strong Correlations (>|0.6|)", escape = FALSE) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
# Display
cor_table
| Variable1 | Variable2 | Correlation |
|---|---|---|
| rfm3 | rfm1 | 0.96 |
| rfm2 | rfm1 | 0.91 |
| rfm8 | rfm6 | 0.89 |
| rfm3 | rfm2 | 0.88 |
| rfm7 | rfm5 | 0.80 |
| rfm5 | cnt_tgt | 0.79 |
| rfm4 | rfm2 | 0.76 |
| rfm12 | rfm8 | 0.74 |
| rfm12 | rfm6 | 0.71 |
| rfm4 | rfm1 | 0.69 |
| rfm4 | rfm3 | 0.66 |
| rfm11 | rfm10 | 0.66 |
| rfm7 | cnt_tgt | 0.62 |
| Correlation (Abs. Value) | Strength | Suggestion |
|---|---|---|
| 0.0–0.3 | Weak | Usually not worth flagging |
| 0.3–0.5 | Moderate | Context-dependent |
| 0.5–0.7 | Strong-ish | Often worth a closer look |
| 0.7–0.9+ | Very strong | Good candidates for multicollinearity check |
There is extremely high correlation across the RFM variables,
especially rfm1, rfm2, and
rfm3.
rfm1 (Avg Sales past 3 yrs) is highly correlated
with:
rfm2 (Avg Sales Lifetime): 0.91
rfm3 (Avg Sales Past 3 yrs Dir Promo Resp):
0.96
This is expected as these variables are all related to average sales over different time periods.
rfm6 (Count Purchased Lifetime) and
rfm8 (Count Purchased Lifetime Dir Promo Resp):
0.89
rfm5(Count Purchased Past 3 Yrs) and
cnt_tgt (Target variable): 0.79
Modelling Approach
rfm1 and rfm3, can cause multicollinearity,
which may:
rfm1 or
rfm3, not both), or use dimension reduction techniques such
as:
rfm5)
and the target variable (cnt_tgt) suggests these may be
strong predictors:
# Log transformation for skewed variables
skewed_vars <- c("int_tgt", "cnt_tgt", "rfm1", "rfm2", "rfm3", "rfm4", "rfm10", "demog_homeval")
bank[paste("log_", skewed_vars, sep = "")] <- lapply(bank[skewed_vars], function(x) {
if (any(x <= 0, na.rm = TRUE)) {
x[x <= 0] <- NA
}
log(x)
})
# Check the distribution of the transformed variables
skewed_vars_transformed <- bank %>%
select(starts_with("log_")) %>%
gather(key = "Variable", value = "Value") %>%
filter(!is.na(Value))
ggplot(skewed_vars_transformed, aes(x = Value)) +
geom_histogram(bins = 30, fill = "#868686FF", color = "black") +
facet_wrap(~ Variable, scales = "free") +
labs(title = "Distribution of Log-Transformed Variables", x = "Log Value", y = "Count") +
theme_minimal()
# Function to generate boxplot and return both plot and outlier count
outlier_check_plot <- function(data, var) {
# Generate boxplot
p <- ggplot(data, aes(y = .data[[var]])) +
geom_boxplot(fill = "#00AFBB", color = "black", outlier.color = "red", outlier.shape = 16) +
labs(title = paste("Boxplot of", var), y = var) +
theme_minimal()
# Identify outliers using 1.5*IQR rule
qnt <- quantile(data[[var]], probs = c(0.25, 0.75), na.rm = TRUE)
iqr <- qnt[2] - qnt[1]
lower <- qnt[1] - 1.5 * iqr
upper <- qnt[2] + 1.5 * iqr
outliers <- which(data[[var]] < lower | data[[var]] > upper)
return(list(plot = p, count = length(outliers)))
}
# Define variables to check
variables_to_check <- c(
"log_cnt_tgt", "log_demog_homeval", "log_int_tgt", "log_rfm1", "log_rfm10",
"log_rfm2", "log_rfm3", "log_rfm4", "rfm5", "rfm6", "rfm7", "rfm8",
"rfm9", "rfm11", "rfm12", "demog_age", "demog_inc", "demog_pr"
)
# Store results
results <- lapply(variables_to_check, function(v) outlier_check_plot(bank, v))
names(results) <- variables_to_check
# Extract outlier counts and plots
outlier_counts <- sapply(results, function(x) x$count)
outlier_plots <- lapply(results, function(x) x$plot)
# Print summary sorted by outlier count
outlier_summary <- data.frame(Variable = names(outlier_counts), Outlier_Count = outlier_counts)
outlier_summary <- outlier_summary %>% arrange(desc(Outlier_Count))
print(outlier_summary)
## Variable Outlier_Count
## log_rfm10 log_rfm10 157136
## rfm7 rfm7 121884
## rfm11 rfm11 112803
## demog_pr demog_pr 63087
## rfm9 rfm9 51519
## rfm8 rfm8 47377
## rfm6 rfm6 37524
## log_rfm2 log_rfm2 23628
## log_rfm4 log_rfm4 21103
## demog_inc demog_inc 19668
## rfm5 rfm5 19095
## log_demog_homeval log_demog_homeval 17600
## log_rfm3 log_rfm3 14972
## log_rfm1 log_rfm1 13999
## log_int_tgt log_int_tgt 8613
## demog_age demog_age 312
## rfm12 rfm12 198
## log_cnt_tgt log_cnt_tgt 11
# Display boxplots in 2x2 grid layout
plot_chunks <- ceiling(length(outlier_plots) / 4)
for (i in 1:plot_chunks) {
idx <- ((i - 1) * 4 + 1):min(i * 4, length(outlier_plots))
grid.arrange(grobs = outlier_plots[idx], ncol = 2)
}
Outliers can significantly affect the performance of predictive models. They can distort model parameters, increase variance, and reduce the ability of models to generalize well to unseen data. The strategies outlined below can help manage the influence of outliers:
cnt_tgt, int_tgt, etc.).cnt_tgt, rfm1,
rfm2).cnt_tgt).# indicate the target variables int_tgt, cnt_tgt, and b_tgt
# Create a bar plot for b_tgt with count labels and plain y-axis
b_tgt_plot <- ggplot(bank, aes(x = as.factor(b_tgt))) +
geom_bar(fill = "#00AFBB", color = "black") +
geom_text(stat = "count", aes(label = ..count..), vjust = -0.5, size = 4) +
scale_y_continuous(labels = scales::comma) +
labs(title = "Distribution of b_tgt", x = "b_tgt", y = "Count") +
theme_minimal()
# Create histogram for int_tgt with colors and plain y-axis
int_tgt_hist <- ggplot(bank, aes(x = int_tgt)) +
geom_histogram(bins = 30, fill = "#E7B800", color = "black") +
scale_y_continuous(labels = scales::comma) +
labs(title = "Histogram of int_tgt", x = "int_tgt", y = "Frequency") +
theme_minimal()
# Create histogram for cnt_tgt with colors and plain y-axis
cnt_tgt_hist <- ggplot(bank, aes(x = cnt_tgt)) +
geom_histogram(bins = 30, fill = "#FC4E07", color = "black") +
scale_y_continuous(labels = scales::comma) +
labs(title = "Histogram of cnt_tgt", x = "cnt_tgt", y = "Frequency") +
theme_minimal()
# Create a list to store plots
plots <- list()
plots$b_tgt_plot <- b_tgt_plot
plots$int_tgt_hist <- int_tgt_hist
plots$cnt_tgt_hist <- cnt_tgt_hist
# Display the plots
plots$b_tgt_plot
plots$int_tgt_hist
plots$cnt_tgt_hist
# Load gt
library(gt)
# Define target variables
tgt_vars <- c("int_tgt", "cnt_tgt")
# Function to compute summary statistics
compute_summary_statistics <- function(bank, target_vars) {
summary_stats <- bank %>%
dplyr::select(dplyr::all_of(target_vars)) %>%
dplyr::summarise(dplyr::across(
everything(),
list(
mean = ~mean(., na.rm = TRUE),
sd = ~sd(., na.rm = TRUE),
median = ~median(., na.rm = TRUE),
min = ~min(., na.rm = TRUE),
max = ~max(., na.rm = TRUE),
skewness = ~e1071::skewness(., na.rm = TRUE)
),
.names = "{col}_{fn}"
)) %>%
tidyr::pivot_longer(cols = everything(),
names_to = c("Variable", "Statistic"),
names_pattern = "(.*)_(.*)",
values_to = "Value") %>%
tidyr::pivot_wider(names_from = Statistic, values_from = Value) %>%
dplyr::mutate(dplyr::across(where(is.numeric), ~round(.x, 2)))
return(summary_stats)
}
# Run the function for target variables
summary_statistics_tgt <- compute_summary_statistics(bank, tgt_vars)
# Create a beautiful gt table
summary_statistics_tgt %>%
gt() %>%
fmt_number(
columns = where(is.numeric),
decimals = 2, # Keep 2 decimal places
use_seps = TRUE # Add commas (thousand separators)
) %>%
tab_header(
title = "Summary Statistics for Target Variables"
) %>%
tab_options(
table.width = pct(100),
table.font.size = 14,
heading.align = "center",
data_row.padding = px(6)
)
| Summary Statistics for Target Variables | ||||||
| Variable | mean | sd | median | min | max | skewness |
|---|---|---|---|---|---|---|
| int_tgt | 11,235.87 | 8,491.80 | 10,000.00 | 0.00 | 500,000.00 | 13.30 |
| cnt_tgt | 0.31 | 0.70 | 0.00 | 0.00 | 6.00 | 2.40 |
# Function to compute summary statistics for a binary variable
compute_binary_summary <- function(data, binary_var) {
binary_data <- data[[binary_var]]
# Remove missing values
binary_data <- binary_data[!is.na(binary_data)]
total_n <- length(binary_data)
n_ones <- sum(binary_data == 1)
n_zeros <- sum(binary_data == 0)
prop_ones <- round(n_ones / total_n, 2)
prop_zeros <- round(n_zeros / total_n, 2)
mode_val <- ifelse(n_ones >= n_zeros, 1, 0)
summary_df <- data.frame(
Variable = binary_var,
Count = total_n,
Count_1s = n_ones,
Count_0s = n_zeros,
Proportion_1s = prop_ones,
Proportion_0s = prop_zeros,
Mode = mode_val
)
return(summary_df)
}
# Run for binary variable b_tgt
binary_summary <- compute_binary_summary(bank, "b_tgt")
# Display nicely formatted HTML table
binary_summary %>%
kable("html", caption = "Summary Statistics for Binary Target Variable (b_tgt)") %>%
kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover"))
| Variable | Count | Count_1s | Count_0s | Proportion_1s | Proportion_0s | Mode |
|---|---|---|---|---|---|---|
| b_tgt | 1060038 | 211509 | 848529 | 0.2 | 0.8 | 0 |
# Load necessary library
library(nortest)
# Formal Normality Testing (Large Sample Safe)
for (var in tgt_vars) {
cat("\nAnderson-Darling Test for", var, "\n")
print(ad.test(bank[[var]]))
cat("\nKolmogorov-Smirnov Test for", var, "\n")
# Standardize the variable first
var_std <- scale(bank[[var]])
print(ks.test(var_std, "pnorm"))
}
##
## Anderson-Darling Test for int_tgt
##
## Anderson-Darling normality test
##
## data: bank[[var]]
## A = 4250.3, p-value < 2.2e-16
##
##
## Kolmogorov-Smirnov Test for int_tgt
## Warning in ks.test.default(var_std, "pnorm"): ties should not be present for
## the one-sample Kolmogorov-Smirnov test
##
## Asymptotic one-sample Kolmogorov-Smirnov test
##
## data: var_std
## D = 0.11227, p-value < 2.2e-16
## alternative hypothesis: two-sided
##
##
## Anderson-Darling Test for cnt_tgt
##
## Anderson-Darling normality test
##
## data: bank[[var]]
## A = 242695, p-value < 2.2e-16
##
##
## Kolmogorov-Smirnov Test for cnt_tgt
## Warning in ks.test.default(var_std, "pnorm"): ties should not be present for
## the one-sample Kolmogorov-Smirnov test
##
## Asymptotic one-sample Kolmogorov-Smirnov test
##
## data: var_std
## D = 0.47252, p-value < 2.2e-16
## alternative hypothesis: two-sided
int_tgt:
cnt_tgt:
Both tests strongly reject the null hypothesis of normality (p-value
<< 0.05), indicating that neither int_tgt nor
cnt_tgt follows a normal distribution.
int_tgt:
cnt_tgt:
Similar to the Anderson-Darling test, the Kolmogorov-Smirnov test results indicate significant deviation from a normal distribution. Note: The KS test assumes no ties in the data, and warnings were issued, but given the extremely small p-values, the evidence against normality remains very strong.
Given the strong evidence of non-normality for both
int_tgt and cnt_tgt:
Additionally: - cnt_tgt, being a count variable, may be
better modeled with Poisson regression or Negative Binomial regression
depending on dispersion. - If cnt_tgt has a high proportion
of zeros, Zero-Inflated models could also be appropriate.
# List of categorical variable names excluding b_tgt
categorical_vars <- c("cat_input1", "cat_input2", "demog_ho", "demog_genf")
# Function to compute frequency tables and create bar charts
compute_frequency_and_plot <- function(bank, categorical_vars) {
# Loop through each variable
lapply(categorical_vars, function(var) {
# Frequency table for the variable
freq_table <- bank %>%
dplyr::select(dplyr::all_of(var)) %>%
dplyr::filter(!is.na(.data[[var]])) %>%
dplyr::group_by(.data[[var]]) %>%
dplyr::summarise(Count = n(), .groups = "drop") %>%
dplyr::mutate(
Level = as.character(.data[[var]]), # Ensure Level is character
Percent = round(100 * Count / sum(Count), 2),
Variable = var
) %>%
dplyr::select(Variable, Level, Count, Percent)
# Display frequency table
print(freq_table)
# Plot bar chart for the variable
ggplot(freq_table, aes(x = Level, y = Count, fill = Level)) +
geom_bar(stat = "identity", show.legend = FALSE) +
theme_minimal() +
labs(title = paste("Frequency of", var), x = "Category", y = "Count") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
})
}
# Run the function for categorical variables
compute_frequency_and_plot(bank, categorical_vars)
## # A tibble: 3 × 4
## Variable Level Count Percent
## <chr> <chr> <int> <dbl>
## 1 cat_input1 X 831371 78.4
## 2 cat_input1 Y 77847 7.34
## 3 cat_input1 Z 150820 14.2
## # A tibble: 5 × 4
## Variable Level Count Percent
## <chr> <chr> <int> <dbl>
## 1 cat_input2 A 188398 17.8
## 2 cat_input2 B 192382 18.2
## 3 cat_input2 C 169550 16.0
## 4 cat_input2 D 122282 11.5
## 5 cat_input2 E 387426 36.6
## # A tibble: 2 × 4
## Variable Level Count Percent
## <chr> <chr> <int> <dbl>
## 1 demog_ho 0 476741 45.0
## 2 demog_ho 1 583297 55.0
## # A tibble: 2 × 4
## Variable Level Count Percent
## <chr> <chr> <int> <dbl>
## 1 demog_genf 0 464288 43.8
## 2 demog_genf 1 595750 56.2
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
We analyzed the categorical predictors in the dataset to understand the number of categories each predictor has and how these categories are distributed. Below are the findings for each of the predictors:
The following table provides a summary of the range of values for the key numerical predictors in the dataset from prior code:
| Variable | Range |
|---|---|
| int_tgt | 0.00 to 500,000 |
| cnt_tgt | 0.00 to 6.00 |
| rfm1 | 0.00 to 3,713.31 |
| rfm2 | 1.58 to 650.00 |
| rfm3 | 0.00 to 3,713.31 |
| rfm4 | 0.00 to 10,000.00 |
| rfm5 | 0.00 to 18.00 |
| rfm6 | 0.00 to 127.00 |
| rfm7 | 0.00 to 11.00 |
| rfm8 | 0.00 to 46.00 |
| rfm9 | 2.00 to 29.00 |
| rfm10 | 0.00 to 77.00 |
| rfm11 | 0.00 to 22.00 |
| rfm12 | 0.00 to 571.00 |
| demog_age | -1.00 to 89.00 |
| demog_homeval | 0.00 to 600,067.00 |
| demog_inc | 0.00 to 200,007.00 |
Wide Range of Values: Variables such as int_tgt, rfm1, rfm3, rfm4, demog_homeval, and demog_inc exhibit very large ranges, suggesting a high degree of variability. This might indicate the presence of extreme outliers that could skew analyses and should be checked further.
Narrower Range: Variables like cnt_tgt and rfm5 have narrower ranges, indicating less variation and potential stability across customers. These could serve as stable predictors in models.
Data Quality Issues: The variable demog_age includes a negative value (-1.00), which likely represents missing or invalid data. This should be addressed during data cleaning.
The following table highlights some key correlation values between numerical predictors:
| Variable1 | Variable2 | Correlation |
|---|---|---|
| rfm1 | rfm2 | 0.91 |
| rfm1 | rfm3 | 0.96 |
| rfm2 | rfm3 | 0.88 |
| rfm5 | cnt_tgt | 0.79 |
| rfm6 | rfm8 | 0.89 |
Given the very strong correlations observed between several predictors (e.g., rfm1 and rfm3), we must be cautious about potential multicollinearity. Multicollinearity can distort model interpretations by inflating standard errors and making coefficient estimates unreliable.
The strong correlation between some of the predictors and the target variable (cnt_tgt) suggests that these features should be prioritized for model training. Notably, rfm5 (Count Purchased Past 3 Years) shows a correlation of 0.79 with cnt_tgt, making it a key predictor.
When selecting models, it’s important to consider how multicollinearity might impact performance, especially for linear models. Tree-based models like Random Forest or Gradient Boosting Machines (GBM) are robust to multicollinearity and can handle correlated features without significant issues.
However, if model interpretability is a priority, addressing multicollinearity should be part of the preprocessing steps to ensure clearer insights into predictor relationships and model coefficients.
A major observation in the data is the extremely high correlation among several of the RFM variables, particularly rfm1, rfm2, and rfm3, with correlations ranging from 0.88 to 0.96. These variables are related to different aspects of sales and promotional response over time, leading to high multicollinearity.
Several predictors, such as int_tgt, rfm1, rfm3, rfm4, and demog_homeval, exhibit skewed distributions with heavy tails (e.g., int_tgt has a skewness of 13.30). This could indicate the presence of outliers or extreme values that may disproportionately influence model training.
Some variables like demog_homeval and demog_inc have large ranges, spanning from zero to very large values. These wide ranges may lead to challenges in models that rely on scale-sensitive techniques (e.g., linear models, SVMs).
The target variable cnt_tgt (Count of New Product Purchased) has a wide range but a relatively low mean and a moderate skewness (2.40). This suggests that most customers purchase a small number of products, but there are a few customers with very high counts.
Demographic variables like demog_age, demog_homeval, and demog_inc have important implications for feature engineering. For example, demog_age exhibits a moderate level of skewness and includes negative values (likely indicating missing or erroneous data), which suggests that this variable needs to be cleaned and transformed.
Given the strong correlation between some of the RFM variables and the target variable cnt_tgt (e.g., rfm5 and cnt_tgt have a correlation of 0.79), it is essential to focus on features that exhibit strong predictive power.